In [1]:
import numpy as np
import pandas as pd
import scipy.special as sp
from scipy.stats import norm, beta, gamma, invgamma
import scipy.stats as st
from scipy.optimize import fsolve
import arviz as az
import graphviz
import matplotlib.pyplot as plt
import seaborn as sns
import pymc as pm
import pymc_bart as pmb
import xarray as xr
import operator
import pytensor.tensor as pt
from operator import itemgetter
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit

from itertools import chain
from IPython.display import display, Markdown

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
plt.style.use("ggplot")
plt.rcParams["figure.figsize"] = [10, 6]
plt.rcParams["figure.dpi"] = 100
plt.rcParams["figure.facecolor"] = "white"

sns_c = sns.color_palette(palette='tab10')

Data Dictionary¶

  • date : Format YYYY-mm-dd
  • season : season (1:winter, 2:spring, 3:summer, 4:fall)
  • yr : year (0: 2011, 1:2012)
  • mnth : month ( 1 to 12)
  • holiday : weather day is holiday or not
  • weekday : day of the week
  • workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
  • weathersit :
    • 1: Clear, Few clouds, Partly cloudy, Partly cloudy
    • 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
    • 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
  • temp : Normalized temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-8, t_max=+39
  • atemp: Normalized feeling temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-16, t_max=+50 (only in hourly scale)
  • hum: Normalized humidity. The values are divided to 100 (max)
  • windspeed: Normalized wind speed. The values are divided to 67 (max)
  • casual: count of casual users
  • registered: count of registered users
  • cnt: count of total rental bikes including both casual and registered

Data Processing¶

In [2]:
# Daily Dataset from: https://archive.ics.uci.edu/dataset/275/bike+sharing+dataset
df = pd.read_csv('day.csv').drop('instant', axis = 1)
df.rename(columns={"dteday":"date", "cnt":"rentals"}, inplace = True)
df['date'] = pd.to_datetime(df['date'])
df['intercept'] = 1.0
df['temp_diffs'] = df['temp'].diff()

# Need to scale Rentals
target_scaler = MinMaxScaler()
rentals = df['rentals'].to_numpy()
rentals_scaled = target_scaler.fit_transform(X=rentals.reshape(-1,1)).flatten()

features = [
    'intercept', 'holiday', 'workingday', 
    'weathersit','temp', 'hum', 'windspeed', 
    'season'
]

X = df.loc[:, features].to_numpy(copy=True)
Y = df.loc[:, 'rentals'].to_numpy(copy = True)

tscv = TimeSeriesSplit(n_splits = 2, gap = 0, test_size = 5)

# Create Train/Test Splits
_, (train_idx, test_idx) = tscv.split(X)

X_train, X_test = X[train_idx, :], X[test_idx]
Y_train, Y_test = Y[train_idx], Y[test_idx]

assert len(X_train) == len(Y_train)
assert len(X_test) == len(Y_test)

# n = Number of rows and p = number of predictors including intercept
n, p = X.shape 
print(f"Number of Rows: {n} | Number of Predictors: {p}")
print()

df.loc[:, features + ['temp_diffs', 'rentals']]
Number of Rows: 731 | Number of Predictors: 8

Out[2]:
intercept holiday workingday weathersit temp hum windspeed season temp_diffs rentals
0 1.0 0 0 2 0.344167 0.805833 0.160446 1 NaN 985
1 1.0 0 0 2 0.363478 0.696087 0.248539 1 0.019311 801
2 1.0 0 1 1 0.196364 0.437273 0.248309 1 -0.167114 1349
3 1.0 0 1 1 0.200000 0.590435 0.160296 1 0.003636 1562
4 1.0 0 1 1 0.226957 0.436957 0.186900 1 0.026957 1600
... ... ... ... ... ... ... ... ... ... ...
726 1.0 0 1 2 0.254167 0.652917 0.350133 1 0.010834 2114
727 1.0 0 1 2 0.253333 0.590000 0.155471 1 -0.000834 3095
728 1.0 0 0 2 0.253333 0.752917 0.124383 1 0.000000 1341
729 1.0 0 0 1 0.255833 0.483333 0.350754 1 0.002500 1796
730 1.0 0 1 2 0.215833 0.577500 0.154846 1 -0.040000 2729

731 rows × 10 columns

EDA¶

In [3]:
df.loc[:, features + ['rentals']].describe().round(2)
Out[3]:
intercept holiday workingday weathersit temp hum windspeed season rentals
count 731.0 731.00 731.00 731.00 731.00 731.00 731.00 731.00 731.00
mean 1.0 0.03 0.68 1.40 0.50 0.63 0.19 2.50 4504.35
std 0.0 0.17 0.47 0.54 0.18 0.14 0.08 1.11 1937.21
min 1.0 0.00 0.00 1.00 0.06 0.00 0.02 1.00 22.00
25% 1.0 0.00 0.00 1.00 0.34 0.52 0.13 2.00 3152.00
50% 1.0 0.00 1.00 1.00 0.50 0.63 0.18 3.00 4548.00
75% 1.0 0.00 1.00 2.00 0.66 0.73 0.23 3.00 5956.00
max 1.0 1.00 1.00 3.00 0.86 0.97 0.51 4.00 8714.00
In [4]:
fig, ax = plt.subplots(nrows= 1, ncols = 2, figsize=(15,5))
sns.kdeplot(x='rentals', data=df, fill=True, color='blue', ax = ax[0])
sns.kdeplot(data=rentals_scaled, fill=True, color='blue', ax = ax[1])

ax[0].set_title('Kernel Density: Bike Rentals')
ax[1].set_title('Kernel Density: Scaled Bike Rentals')
fig.suptitle('Bike Rentals', y=1);
In [5]:
def plot_numeric_features(numeric_features, target, save_fig = True):

    fig, axes = plt.subplots(
        nrows=len(numeric_features) + 1,
        ncols=1,
        figsize=(15, 12),
        constrained_layout=True
    )

    sns.lineplot(x='date', y=target, data=df, color='red', ax=axes[0])
    axes[0].set(title=target, ylabel='Rentals')

    for i, feature in enumerate(numeric_features):
        ax = axes[i+1]
        sns.lineplot(
            x='date',
            y=feature,
            data=df,
            color=sns_c[i],
            ax=ax
        )
        ax.set(title=feature, ylabel='Normalized Value')
        
    if save_fig:
        plt.savefig('numeric_features.png')
        
    plt.show()
    

numeric_features = [
    'temp',
    'temp_diffs',
    'hum',
    'windspeed',
]
        
plot_numeric_features(numeric_features, 'rentals', save_fig = True)
In [6]:
def plot_cat_features(df, cat_cols, y, save_fig = True):

    fig, axes = plt.subplots(
        nrows=3,
        ncols=1,
        figsize=(10, 8),
        constrained_layout=True
    )
    axes = axes.flatten()

    for i, feature in enumerate(cat_cols):
        ax = axes[i]

        feature_df = df \
            .groupby(feature, as_index=False) \
            .agg({y: np.mean}) \
            .sort_values(y)

        sns.barplot(x=feature, y=y, data=feature_df, dodge=False, ax=ax)
        ax.set(title=feature, ylabel=None)

    fig.suptitle(f'Average # of Bike Rentals over Categorical Features')
    
    if save_fig:
        plt.savefig('cat_features.png')
    
    plt.show()

    
cat_features = [
    'holiday',
    'workingday',
    'weathersit',
]
plot_cat_features(df, cat_features, 'rentals', save_fig = True)

Regression Analysis¶

PyMC Model Setup¶

In [7]:
date = df['date'].to_numpy()
intercept = df['intercept'].to_numpy()
temp = df['temp'].to_numpy()
hum = df['hum'].to_numpy()
windspeed = df['windspeed'].to_numpy()

# Categorical Variables - Needed for ZeroSumNormal Dist
holiday_idx, holiday = df['holiday'].factorize()
workingday_idx, workingday = df['workingday'].factorize()
weathersit_idx, weathersit = df['weathersit'].factorize()

# Categorical Dist Probabilities
# 12 Holidays: https://dchr.dc.gov/page/holiday-schedules
p_holiday = 12/365

# Weekdays / # of Days in Week
p_working_day = 5/7

# More informative prior based on typical Washington Data
p_weathersit = [6/10, 3/10, 1/10]

Multivariate Regression: Base Model - Intercept + Temperature Only (Student T Likelihood)¶

In [8]:
rng = np.random.default_rng(42)

coords = {
    "date": date,
}
with pm.Model(coords=coords) as base_model:
    # Intercept Prior
    intercept = pm.Normal(name="intercept", mu = 0, sigma = 1)
    
    # Beta Coefficient Priors (Informative) - Numeric 
    # Temp: https://www.weather.gov/media/lwx/climate/dcatemps.pdf
    beta_temp = pm.Normal("beta_temp", mu = 0.44, sigma = 0.2)
    
    # Likelihood precision/variance priors
    nu = pm.Gamma("nu", alpha = 8, beta = 2)
    sigma = pm.HalfNormal("sigma", sigma = 1)

    # Regression Function
    mu = pm.Deterministic("mu", var = intercept + beta_temp * temp, dims="date")

    # Likelihood
    # Use StudentT over Normal since it handles small samples better than Normal Dist
    likelihood = pm.StudentT(
        name="likelihood", 
        mu = mu, 
        nu = nu,
        sigma = sigma, 
        observed = rentals_scaled,
        dims="date",
    )

    base_trace = pm.sample(
        draws = 10000,
        chains = 4,
        tune = 2000,
        cores = 4,
        idata_kwargs = {"log_likelihood": True},
        random_seed = rng,
    )
    prior_pred_base = pm.sample_prior_predictive(samples = 1000, random_seed = rng)
    ppc_base = pm.sample_posterior_predictive(
        trace = base_trace, random_seed = rng, extend_inferencedata = True,
    )
    
    
az.summary(base_trace, hdi_prob = 0.95, var_names = ["~mu"], round_to = 3)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, beta_temp, nu, sigma]
100.00% [48000/48000 00:30<00:00 Sampling 4 chains, 0 divergences]
Sampling 4 chains for 2_000 tune and 10_000 draw iterations (8_000 + 40_000 draws total) took 47 seconds.
Sampling: [beta_temp, intercept, likelihood, nu, sigma]
Sampling: [likelihood]
100.00% [40000/40000 00:07<00:00]
Out[8]:
mean sd hdi_2.5% hdi_97.5% mcse_mean mcse_sd ess_bulk ess_tail r_hat
intercept 0.134 0.018 0.098 0.169 0.000 0.000 20991.846 23310.367 1.0
beta_temp 0.765 0.035 0.697 0.833 0.000 0.000 20779.664 23287.662 1.0
nu 10.521 1.735 7.289 13.976 0.011 0.008 26722.969 26427.229 1.0
sigma 0.163 0.005 0.153 0.173 0.000 0.000 26790.736 25262.860 1.0

Prior & Posterior Predictive Checks¶

In [9]:
def plot_prior_predictive(ppc, title, model_name, save_fig = True):
    # Code referenced from: https://www.pymc.io/projects/examples/en/latest/time_series/Forecasting_with_structural_timeseries.html
    palette = "plasma"
    cmap = plt.get_cmap(palette)
    percs = np.linspace(51, 99, 100)
    colors = (percs - np.min(percs)) / (np.max(percs) - np.min(percs))

    fig, ax = plt.subplots(figsize=(10, 4))

    for i, p in enumerate(percs[::-1]):
        upper = np.percentile(ppc.prior_predictive["likelihood"], p, axis = 1)
        lower = np.percentile(ppc.prior_predictive["likelihood"], 100 - p, axis = 1)
        
        color_val = colors[i]
        
        ax.fill_between(
            x=date,
            y1=upper.flatten(),
            y2=lower.flatten(),
            color=cmap(color_val),
            alpha=0.1,
        )
    sns.lineplot(x=date, y=rentals_scaled, color="k", ax=ax, label="Rentals Scaled")
    ax.set(title=f"{title} - Prior Predictive Fit", xlabel="date", ylabel="Scaled Bike rentals")
    
    ax.legend()
    plt.tight_layout()
    
    if save_fig:
        fig.savefig(f'prior_predictive_{model_name}')
    
    plt.plot()

    
plot_prior_predictive(prior_pred_base, title = 'Base Model: Intercept + Temperature', model_name = 'baseline_lin_reg_0.png', save_fig = True)
In [10]:
def plot_posterior_predictive(ppc, title, model_name, save_fig = True):

    # Code referenced from: https://www.pymc.io/projects/examples/en/latest/time_series/Forecasting_with_structural_timeseries.html
    palette = "Wistia"
    cmap = plt.get_cmap(palette)
    percs = np.linspace(51, 99, 100)
    colors = (percs - np.min(percs)) / (np.max(percs) - np.min(percs))

    ppc_likelihood = ppc.posterior_predictive["likelihood"].stack(sample=("chain", "draw"))

    # Need to unscale our rentals data to get a proper view for posterior fit 
    ppc_likelihood_inv = target_scaler.inverse_transform(
        X = ppc_likelihood.to_numpy()
    )

    fig, ax = plt.subplots(figsize=(12, 6))

    for i, p in enumerate(percs[::-1]):
        upper = np.percentile(ppc_likelihood_inv, p, axis = 1)
        lower = np.percentile(ppc_likelihood_inv, 100 - p, axis = 1)
        color_val = colors[i]
        ax.fill_between(
            x=date,
            y1=upper,
            y2=lower,
            color=cmap(color_val),
            alpha=0.05,
        )

    sns.lineplot(
        x=date,
        y=ppc_likelihood_inv.mean(axis=1),
        color="blue",
        label="posterior predictive mean",
        ax=ax,
    )
    sns.lineplot(
        x=date,
        y=rentals,
        color="black",
        label="rentals",
        ax=ax,
    )
    ax.legend(loc="upper left")
    ax.set(
        title=f"{title} - Posterior Predictive Fit",
        xlabel="date",
        ylabel="rentals",
    )
    plt.tight_layout()
    
    if save_fig:
        fig.savefig(f'posterior_predictive_{model_name}')
        
    plt.show()

    
    
plot_posterior_predictive(ppc_base, 'Base Model: Intercept + Temperature', model_name = 'baseline_lin_reg_0.png', save_fig = True)

Multivariate Regression: Base Model - Intercept + Temperature Only (Normal Likelihood)¶

In [11]:
rng = np.random.default_rng(42)

coords = {
    "date": date,
}
with pm.Model(coords=coords) as base_model_normal:
    # Intercept Prior
    intercept = pm.Normal(name="intercept", mu = 0, sigma = 1)
    
    # Beta Coefficient Priors (Informative) - Numeric 
    # Temp: https://www.weather.gov/media/lwx/climate/dcatemps.pdf
    beta_temp = pm.Normal("beta_temp", mu = 0.44, sigma = 0.2)
    
    # Likelihood precision/variance priors
    sigma = pm.HalfNormal("sigma", sigma = 1)
    

    # Regression Function
    mu = pm.Deterministic("mu", var = intercept + beta_temp * temp, dims="date")

    # Likelihood
    # Use StudentT over Normal since it handles small samples better than Normal Dist
    likelihood = pm.Normal(
        name="likelihood", 
        mu = mu,
        sigma = sigma, 
        observed = rentals_scaled,
        dims="date",
    )

    base_trace_normal = pm.sample(
        draws = 10000,
        chains = 4,
        tune = 2000,
        cores = 4,
        idata_kwargs = {"log_likelihood": True},
        random_seed = rng,
    )
    prior_pred_base_normal = pm.sample_prior_predictive(samples = 1000, random_seed = rng)
    ppc_base_normal = pm.sample_posterior_predictive(
        trace = base_trace_normal, random_seed = rng, extend_inferencedata = True,
    )
    
    
az.summary(base_trace_normal, hdi_prob = 0.95, var_names = ["~mu"], round_to = 3)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, beta_temp, sigma]
100.00% [48000/48000 00:18<00:00 Sampling 4 chains, 0 divergences]
Sampling 4 chains for 2_000 tune and 10_000 draw iterations (8_000 + 40_000 draws total) took 34 seconds.
Sampling: [beta_temp, intercept, likelihood, sigma]
Sampling: [likelihood]
100.00% [40000/40000 00:03<00:00]
Out[11]:
mean sd hdi_2.5% hdi_97.5% mcse_mean mcse_sd ess_bulk ess_tail r_hat
intercept 0.142 0.018 0.106 0.178 0.0 0.0 13586.162 16185.476 1.0
beta_temp 0.755 0.035 0.688 0.823 0.0 0.0 13729.422 16303.977 1.0
sigma 0.174 0.005 0.165 0.183 0.0 0.0 22226.736 21269.382 1.0

Prior & Posterior Predictive Checks¶

In [12]:
plot_prior_predictive(prior_pred_base_normal, title = 'Base Model: Intercept + Temperature (Normal Likelihood)', model_name = 'baseline_lin_reg_1_normal.png', save_fig = True)
In [13]:
plot_posterior_predictive(ppc_base_normal, 'Base Model: Intercept + Temperature (Normal Likelihood)', model_name = 'baseline_lin_reg_1_normal.png', save_fig = True)

Mean Response Check: Baseline Model¶

In [14]:
post = ppc_base.posterior
mu_pp = post["intercept"] + post["beta_temp"] * xr.DataArray(temp, dims=["date"])

lik = base_trace_normal.posterior_predictive["likelihood"]

fig, ax = plt.subplots(figsize=(10, 5))

sns.lineplot(x=temp, y=mu_pp.mean(("chain", "draw")), label="Posterior Predictive Mean Response", color="blue", alpha=0.6, ax = ax)
sns.scatterplot(x=temp, y=rentals_scaled, label = "Observed Data", ax = ax)
az.plot_hdi(temp, lik, hdi_prob = 0.95, fill_kwargs={'alpha': 0.1, "label" : "$95\%$ HDI"}, ax = ax)

plt.xlabel("Temperature (scaled)")
plt.ylabel("Bike Rentals (scaled)")
plt.title('Mean Response Vs. Observed Data With 95% HDI')
plt.tight_layout()

save_fig = True
if save_fig:
    plt.savefig('mean_response_baseline_lin_reg_0.png')
    
plt.show()

Multivariate Regression: Full Model - All Features (Student T Likelihood) With Uninformative Normal Categorical Priors¶

In [15]:
rng = np.random.default_rng(42)

coords = {
    "date": date,
    "workingday": workingday,
    "weathersit": weathersit,
    "holiday": holiday,
}
with pm.Model(coords=coords) as full_model_1:
    # Intercept Prior
    intercept = pm.Normal(name="intercept", mu = 0, sigma = 1)
    
    # Beta Coefficient Priors (Informative) - Numeric 
    # Temp: https://www.weather.gov/media/lwx/climate/dcatemps.pdf
    beta_temp = pm.Normal("beta_temp", mu = 0.5, sigma = 0.2)
    # Humidity: https://www.worlddata.info/america/usa/climate-washington-d-c.php
    beta_hum = pm.Normal("beta_hum", mu = 0.5, sigma = 0.1)
    # https://weatherspark.com/y/20957/Average-Weather-in-Washington-D.C.;-United-States-Year-Round#Sections-Wind
    beta_windspeed = pm.Normal("beta_windspeed", mu = 0.2, sigma = 0.2)
    
    # Beta Coefficient Priors - Categorical
    beta_holiday = pm.Normal("beta_holiday", mu = 0, sigma=1, shape = len(df['holiday'].unique()))
    beta_workingday = pm.Normal("beta_workingday", mu = 0, sigma=1, shape = len(df['workingday'].unique()))
    beta_weathersit = pm.Normal("beta_weathersit", mu = 0, sigma=1, shape = len(df['weathersit'].unique()))
    
    # Likelihood precision/variance priors
    nu = pm.Gamma("nu", alpha = 2, beta = 0.1)
    sigma = pm.HalfNormal("sigma", sigma = 1)

    # Regression Function
    mu = pm.Deterministic(
        "mu",
        var = (
            intercept
            + beta_temp * temp
            + beta_hum * hum
            + beta_windspeed * windspeed
            + beta_holiday[holiday_idx]
            + beta_workingday[workingday_idx]
            + beta_weathersit[weathersit_idx]
        ),
        dims="date",
    )

    # Likelihood
    # Use StudentT over Normal since it supposedly handles small samples better than Normal Dist
    likelihood = pm.StudentT(
        name="likelihood", 
        mu = mu, 
        nu = nu,
        sigma = sigma, 
        observed = rentals_scaled,
        dims="date",
    )

    full_trace_1 = pm.sample(
        draws = 5000,
        chains = 4,
        tune = 2000,
        cores = 4,
        # target_accept=0.95,
        idata_kwargs = {"log_likelihood": True},
        random_seed = rng,
    )
    prior_pred_full_1 = pm.sample_prior_predictive(samples = 1000, random_seed = rng)
    ppc_full_1 = pm.sample_posterior_predictive(
        trace = full_trace_1, random_seed = rng, extend_inferencedata = True,
    )
    
az.summary(full_trace_1, hdi_prob = 0.95, var_names = ["~mu"], round_to = 3)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, beta_temp, beta_hum, beta_windspeed, beta_holiday, beta_workingday, beta_weathersit, nu, sigma]
100.00% [28000/28000 19:47<00:00 Sampling 4 chains, 0 divergences]
Sampling 4 chains for 2_000 tune and 5_000 draw iterations (8_000 + 20_000 draws total) took 1204 seconds.
Sampling: [beta_holiday, beta_hum, beta_temp, beta_weathersit, beta_windspeed, beta_workingday, intercept, likelihood, nu, sigma]
Sampling: [likelihood]
100.00% [20000/20000 00:03<00:00]
Out[15]:
mean sd hdi_2.5% hdi_97.5% mcse_mean mcse_sd ess_bulk ess_tail r_hat
intercept 0.058 0.760 -1.409 1.554 0.007 0.005 12956.439 13142.530 1.0
beta_temp 0.711 0.034 0.646 0.777 0.000 0.000 26025.263 15443.495 1.0
beta_hum -0.027 0.050 -0.128 0.066 0.000 0.000 21198.777 15330.516 1.0
beta_windspeed -0.293 0.076 -0.444 -0.146 0.001 0.000 22736.322 15271.655 1.0
beta_holiday[0] 0.048 0.627 -1.161 1.270 0.006 0.004 10725.708 11805.765 1.0
beta_holiday[1] -0.022 0.627 -1.266 1.165 0.006 0.004 10780.999 11790.271 1.0
beta_workingday[0] 0.025 0.623 -1.195 1.258 0.006 0.004 10998.951 12716.565 1.0
beta_workingday[1] 0.040 0.623 -1.200 1.255 0.006 0.004 11008.971 12465.386 1.0
beta_weathersit[0] 0.061 0.538 -0.969 1.142 0.006 0.004 8265.035 11263.958 1.0
beta_weathersit[1] 0.127 0.538 -0.910 1.196 0.006 0.004 8254.195 11106.057 1.0
beta_weathersit[2] -0.151 0.538 -1.173 0.939 0.006 0.004 8304.970 11319.666 1.0
nu 47.849 17.373 19.226 82.539 0.109 0.085 26742.218 14261.577 1.0
sigma 0.160 0.004 0.151 0.168 0.000 0.000 23120.135 14653.949 1.0
In [16]:
az.summary(full_trace_1, hdi_prob = 0.95, var_names = ["~mu"], round_to = 3)
#.to_latex()
Out[16]:
mean sd hdi_2.5% hdi_97.5% mcse_mean mcse_sd ess_bulk ess_tail r_hat
intercept 0.058 0.760 -1.409 1.554 0.007 0.005 12956.439 13142.530 1.0
beta_temp 0.711 0.034 0.646 0.777 0.000 0.000 26025.263 15443.495 1.0
beta_hum -0.027 0.050 -0.128 0.066 0.000 0.000 21198.777 15330.516 1.0
beta_windspeed -0.293 0.076 -0.444 -0.146 0.001 0.000 22736.322 15271.655 1.0
beta_holiday[0] 0.048 0.627 -1.161 1.270 0.006 0.004 10725.708 11805.765 1.0
beta_holiday[1] -0.022 0.627 -1.266 1.165 0.006 0.004 10780.999 11790.271 1.0
beta_workingday[0] 0.025 0.623 -1.195 1.258 0.006 0.004 10998.951 12716.565 1.0
beta_workingday[1] 0.040 0.623 -1.200 1.255 0.006 0.004 11008.971 12465.386 1.0
beta_weathersit[0] 0.061 0.538 -0.969 1.142 0.006 0.004 8265.035 11263.958 1.0
beta_weathersit[1] 0.127 0.538 -0.910 1.196 0.006 0.004 8254.195 11106.057 1.0
beta_weathersit[2] -0.151 0.538 -1.173 0.939 0.006 0.004 8304.970 11319.666 1.0
nu 47.849 17.373 19.226 82.539 0.109 0.085 26742.218 14261.577 1.0
sigma 0.160 0.004 0.151 0.168 0.000 0.000 23120.135 14653.949 1.0

Prior & Posterior Predictive Checks¶

In [17]:
plot_prior_predictive(prior_pred_full_1, title = 'Full Model: All Features With Normal Uninformative Priors', model_name = 'full_lin_reg_0_norm', save_fig = True)
In [18]:
az.plot_ppc(full_trace_1, kind="cumulative", num_pp_samples=1000)
plt.show()
In [19]:
plot_posterior_predictive(ppc_full_1, 'Full Model: All Features With Normal Uninformative Priors', model_name = 'full_lin_reg_0_norm', save_fig = True)

Multivariate Regression: Full Model - All Features (Student T Likelihood) With ZeroSumNormal Categorical Priors¶

In [20]:
rng = np.random.default_rng(42)

coords = {
    "date": date,
    "workingday": workingday,
    "weathersit": weathersit,
    "holiday": holiday,
}
with pm.Model(coords=coords) as full_model_2:
    # Intercept Prior
    intercept = pm.Normal(name="intercept", mu = 0, sigma = 1)
    
    # Beta Coefficient Priors (Informative) - Numeric 
    # Temp: https://www.weather.gov/media/lwx/climate/dcatemps.pdf
    beta_temp = pm.Normal("beta_temp", mu = 0.5, sigma = 0.2)
    # Humidity: https://www.worlddata.info/america/usa/climate-washington-d-c.php
    beta_hum = pm.Normal("beta_hum", mu = 0.5, sigma = 0.1)
    # https://weatherspark.com/y/20957/Average-Weather-in-Washington-D.C.;-United-States-Year-Round#Sections-Wind
    beta_windspeed = pm.Normal("beta_windspeed", mu = 0.2, sigma = 0.2)
    
    # Beta Coefficient Priors - Categorical
    beta_holiday = pm.ZeroSumNormal("beta_holiday", sigma=1, dims="holiday")
    beta_workingday = pm.ZeroSumNormal("beta_workingday", sigma=1, dims="workingday")
    beta_weathersit = pm.ZeroSumNormal("beta_weathersit", sigma=1, dims="weathersit")
    
    # Likelihood precision/variance priors
    nu = pm.Gamma("nu", alpha = 2, beta = 0.1)
    sigma = pm.HalfNormal("sigma", sigma = 1)

    # Regression Function
    mu = pm.Deterministic(
        "mu",
        var = (
            intercept
            + beta_temp * temp
            + beta_hum * hum
            + beta_windspeed * windspeed
            + beta_holiday[holiday_idx]
            + beta_workingday[workingday_idx]
            + beta_weathersit[weathersit_idx]
        ),
        dims="date",
    )

    # Likelihood
    # Use StudentT over Normal since it handles small samples better than Normal Dist
    likelihood = pm.StudentT(
        name="likelihood", 
        mu = mu, 
        nu = nu,
        sigma = sigma, 
        observed = rentals_scaled,
        dims="date",
    )

    full_trace_2 = pm.sample(
        draws = 10000,
        chains = 4,
        tune = 2000,
        cores = 4,
        # target_accept=0.95,
        idata_kwargs = {"log_likelihood": True},
        random_seed = rng,
    )
    prior_pred_full_2 = pm.sample_prior_predictive(samples = 1000, random_seed = rng)
    ppc_full_2 = pm.sample_posterior_predictive(
        trace = full_trace_2, random_seed = rng, extend_inferencedata = True,
    )
    
# az.summary(full_trace_2, hdi_prob = 0.95, var_names = ["~mu"], round_to = 2).to_latex(float_format="{:.2f}".format)
az.summary(full_trace_2, hdi_prob = 0.95, var_names = ["~mu"], round_to = 3)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, beta_temp, beta_hum, beta_windspeed, beta_holiday, beta_workingday, beta_weathersit, nu, sigma]
100.00% [48000/48000 02:38<00:00 Sampling 4 chains, 0 divergences]
Sampling 4 chains for 2_000 tune and 10_000 draw iterations (8_000 + 40_000 draws total) took 177 seconds.
Sampling: [beta_holiday, beta_hum, beta_temp, beta_weathersit, beta_windspeed, beta_workingday, intercept, likelihood, nu, sigma]
Sampling: [likelihood]
100.00% [40000/40000 00:07<00:00]
Out[20]:
mean sd hdi_2.5% hdi_97.5% mcse_mean mcse_sd ess_bulk ess_tail r_hat
intercept 0.115 0.049 0.021 0.211 0.000 0.000 24358.682 26350.491 1.0
beta_temp 0.710 0.034 0.645 0.778 0.000 0.000 45202.443 28620.694 1.0
beta_hum -0.026 0.050 -0.121 0.073 0.000 0.000 31758.058 29824.459 1.0
beta_windspeed -0.292 0.077 -0.439 -0.141 0.000 0.000 39393.951 30517.045 1.0
beta_holiday[0] 0.035 0.019 -0.002 0.071 0.000 0.000 43967.064 28920.652 1.0
beta_holiday[1] -0.035 0.019 -0.071 0.002 0.000 0.000 43967.064 28920.652 1.0
beta_workingday[0] -0.007 0.007 -0.020 0.006 0.000 0.000 52162.214 29467.680 1.0
beta_workingday[1] 0.007 0.007 -0.006 0.020 0.000 0.000 52162.214 29467.680 1.0
beta_weathersit[2] 0.049 0.014 0.021 0.076 0.000 0.000 37835.998 31485.796 1.0
beta_weathersit[1] 0.115 0.016 0.085 0.146 0.000 0.000 26606.156 29729.508 1.0
beta_weathersit[3] -0.164 0.025 -0.212 -0.113 0.000 0.000 27604.746 26454.208 1.0
nu 48.226 17.749 18.832 83.390 0.082 0.064 51046.917 29173.238 1.0
sigma 0.160 0.005 0.151 0.169 0.000 0.000 46896.264 28663.952 1.0

Prior & Posterior Predictive Checks¶

In [21]:
plot_prior_predictive(prior_pred_full_2, title = 'Full Model: All Features With ZeroSumCategorical Priors', model_name = 'full_lin_reg_0_zsn', save_fig = True)
In [22]:
az.plot_ppc(full_trace_2, kind="cumulative", num_pp_samples=1000)
plt.show()
In [23]:
plot_posterior_predictive(ppc_full_2, 'Full Model: All Features With ZeroSumCategorical Priors', model_name = 'full_lin_reg_0_zsn', save_fig = True)

Gaussian Random Walk¶

In [24]:
rng = np.random.default_rng(42)

coords = {
    "date": date,
}
with pm.Model(coords=coords) as grw_model:
    # Intercept Prior
    intercept = pm.Normal(name="intercept", mu = 0, sigma = 1)
    
    # Beta Coefficient Priors (Informative) - Numeric 
    # Temp: https://www.weather.gov/media/lwx/climate/dcatemps.pdf
    # beta_temp = pm.Normal("beta_temp", mu = 0.5, sigma = 0.2)
    # Humidity: https://www.worlddata.info/america/usa/climate-washington-d-c.php
    beta_hum = pm.Normal("beta_hum", mu = 0.5, sigma = 0.1)
    # https://weatherspark.com/y/20957/Average-Weather-in-Washington-D.C.;-United-States-Year-Round#Sections-Wind
    beta_windspeed = pm.Normal("beta_windspeed", mu = 0.2, sigma = 0.2)
    
    # Beta Coefficient Priors - Categorical
    beta_holiday = pm.ZeroSumNormal("beta_holiday", sigma=1, shape = 2) # 0/1 Binary
    beta_workingday = pm.ZeroSumNormal("beta_workingday", sigma=1, shape = 2) # 0/1 Binary
    beta_weathersit = pm.ZeroSumNormal("beta_weathersit", sigma=1, shape = 3) # 3 unique values
    
    # Likelihood precision/variance priors
    nu = pm.Gamma("nu", alpha = 2, beta = 0.1)
    sigma = pm.HalfNormal("sigma", sigma = 1)

    
    # Random Walk Priors
    step_size = pm.Normal('step_size', mu = 0, sigma = 0.1)
    beta_temp = pm.GaussianRandomWalk(
        name="beta_temp",
        sigma=step_size, # How far can we deviate from the last value
        init_dist=pm.Normal.dist(mu = 0, sigma = 1),
        dims="date",
    )

    # Regression Function
    mu = pm.Deterministic(
        "mu",
        var = (
            beta_temp * temp
            + beta_hum * hum
            + beta_windspeed * windspeed
            + beta_holiday[holiday_idx]
            + beta_workingday[workingday_idx]
            + beta_weathersit[weathersit_idx]
        ),
    )

    # Likelihood
    # Use StudentT over Normal since it handles small samples better than Normal Dist
    likelihood = pm.StudentT(
        name="likelihood", 
        mu = mu, 
        nu = nu,
        sigma = sigma, 
        observed = rentals_scaled,
    )

    grw_trace = pm.sample(
        draws = 10000,
        chains = 4,
        tune = 2000,
        cores = 4,
        target_accept=0.95,
        idata_kwargs = {"log_likelihood": True},
        random_seed = rng,
    )
    # Prior predictive seems to break PyMC
    # grw_prior_trace = pm.sample_prior_predictive(samples = 1000, random_seed = rng)
    grw_ppc_base = pm.sample_posterior_predictive(
        trace = grw_trace, random_seed = rng, extend_inferencedata = True,
    )
    
    
az.summary(grw_trace, hdi_prob = 0.95, var_names = ["~mu", "~beta_temp"], round_to = 2)
# .to_latex(float_format="{:.2f}".format)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, beta_hum, beta_windspeed, beta_holiday, beta_workingday, beta_weathersit, nu, sigma, step_size, beta_temp]
100.00% [48000/48000 11:44<00:00 Sampling 4 chains, 0 divergences]
Sampling 4 chains for 2_000 tune and 10_000 draw iterations (8_000 + 40_000 draws total) took 720 seconds.
Sampling: [likelihood]
100.00% [40000/40000 00:07<00:00]
Out[24]:
mean sd hdi_2.5% hdi_97.5% mcse_mean mcse_sd ess_bulk ess_tail r_hat
intercept 0.01 1.00 -1.90 2.00 0.0 0.01 68313.45 27093.47 1.0
beta_hum -0.02 0.02 -0.07 0.02 0.0 0.00 2196.25 6494.89 1.0
beta_windspeed -0.12 0.03 -0.18 -0.05 0.0 0.00 8540.76 22181.04 1.0
step_size 0.06 0.01 0.05 0.08 0.0 0.00 1538.21 3184.50 1.0
beta_holiday[0] 0.06 0.01 0.04 0.08 0.0 0.00 12452.50 23847.61 1.0
beta_holiday[1] -0.06 0.01 -0.08 -0.04 0.0 0.00 12452.50 23847.61 1.0
beta_workingday[0] -0.00 0.00 -0.01 0.00 0.0 0.00 40569.27 32970.31 1.0
beta_workingday[1] 0.00 0.00 -0.00 0.01 0.0 0.00 40569.27 32970.31 1.0
beta_weathersit[0] 0.05 0.01 0.04 0.07 0.0 0.00 33374.49 33344.24 1.0
beta_weathersit[1] 0.11 0.01 0.10 0.13 0.0 0.00 6173.08 16805.17 1.0
beta_weathersit[2] -0.17 0.01 -0.19 -0.14 0.0 0.00 11504.30 25434.36 1.0
nu 2.67 0.43 1.90 3.53 0.0 0.00 19169.62 30276.45 1.0
sigma 0.04 0.00 0.04 0.05 0.0 0.00 5341.62 14184.22 1.0

Posterior Predictive Checks¶

In [25]:
plot_posterior_predictive(grw_ppc_base, 'Gaussian Random Walk Model: All Features With ZeroSumCategorical Priors', model_name = 'grw_reg_all_feats_zsn', save_fig = True)
In [26]:
def rev_min_max_func(scaled_arr, min_val, max_val):
    og_val = (scaled_arr*(max_val - min_val)) + min_val
    return og_val

unscaled_temp = rev_min_max_func(temp, -8, 39)
beta_coeffs = grw_trace['posterior']['beta_temp']

# Code referenced from: https://www.pymc.io/projects/examples/en/latest/generalized_linear_models/GLM-rolling-regression.html#analysis-of-results
fig, ax = plt.subplots(figsize=(12, 6))
az.plot_hdi(
    x=unscaled_temp,
    y=beta_coeffs,
    hdi_prob=0.95,
    color="C0",
    fill_kwargs={"alpha": 0.4, "label": "95% HDI"},
    ax=ax,
)
az.plot_hdi(
    x=unscaled_temp,
    y=beta_coeffs,
    hdi_prob=0.5,
    color="C0",
    fill_kwargs={"alpha": 0.6, "label": "50% HDI"},
    ax=ax,
)
ax.axhline(
    y=full_trace_2['posterior']['beta_temp'].mean(dim=["chain", "draw"]).item(),
    color="blue",
    linestyle="--",
    label="Linear Regression (All Features) Posterior Predictive Mean",
)
sns.rugplot(x=unscaled_temp, color="black", alpha=0.5, ax=ax)
ax.axhline(y=0.0, color="gray", linestyle="--")
ax.legend(loc="upper right")
ax.set(
    title="Temperature Coefficient (Original Scale) - Posterior Predictive",
    xlabel="temp",
    ylabel="Beta Temperature",
)
plt.tight_layout()
save_fig = True
if save_fig:
    plt.savefig('gaussian_rw_temp_coeffs.png')
    
plt.show()
In [27]:
axes = az.plot_trace(
    data=grw_trace,
    var_names=["~mu", "~beta_temp"],
    compact=True,
    kind="rank_bars",
    backend_kwargs={"figsize": (12, 15), "layout": "constrained"},
)
plt.gcf().suptitle("Gaussian Random Walk Model", fontsize=16)
plt.show()

Model Comparison¶

Pareto smoothed importance sampling leave-one-out cross-validation (LOO)¶

In [28]:
models = {
    "baseline_student_t": base_trace, 
    "baseline_normal": base_trace_normal, 
    "full_model_norm": full_trace_1, 
    "full_model_zsn": full_trace_2, 
    "gaussian_rw": grw_trace,
}
az.compare(models, scale="deviance", method="stacking").round(3)
# az.compare(models, scale="deviance", method="stacking").to_latex(float_format="{:.3f}".format)
Out[28]:
rank elpd_loo p_loo elpd_diff weight se dse warning scale
gaussian_rw 0 -1651.401 219.114 0.000 0.959 56.159 0.000 False deviance
full_model_norm 1 -571.490 7.552 1079.912 0.041 32.156 59.342 False deviance
full_model_zsn 2 -571.236 7.581 1080.165 0.000 32.142 59.326 False deviance
baseline_normal 3 -481.438 2.537 1169.963 0.000 32.504 58.355 False deviance
baseline_student_t 4 -458.049 2.673 1193.352 0.000 33.419 59.223 False deviance

Laud-Ibrahim criterion¶

In [29]:
def compute_laud_ibrahim_criteria(trace, scaled_y):
    """ Code referenced from: https://areding.github.io/6420-pymc/unit9/Unit9-hald.html?highlight=ibrahim
    """
    Y_new = az.summary(trace.posterior_predictive)["mean"].values
    D2 = (scaled_y - Y_new) ** 2
    L = np.sqrt(np.sum(D2, axis=0) + np.std(Y_new, axis=0) ** 2)
    
    return L
    
model_scores = dict(zip(models.keys(), [0] * len(models)))

for model_name, trace_ in models.items():
    model_scores[model_name] = compute_laud_ibrahim_criteria(trace_, rentals_scaled)

model_scores
Out[29]:
{'baseline_student_t': 4.6919201801878,
 'baseline_normal': 4.6920673536411135,
 'full_model_norm': 4.363369420078653,
 'full_model_zsn': 4.363611454140253,
 'gaussian_rw': 1.9000069698310282}
In [30]:
model_names = list(model_scores.keys())
scores = [model_scores[k] for k in model_names]

colors = ["grey" if m > np.min(scores) else "green" for m in scores]
fig = plt.Figure(figsize=(7,4))
sns.barplot(x=model_names, y=scores, palette = colors)
plt.title('Model Comparison: Laud-Ibrahim Criteria')
plt.ylabel("Score")

save_fig = True
if save_fig:
    plt.savefig('model_comparison.png')
    
plt.show()
In [31]:
def compute_r2_score(ppc_trace, y):
    y_pred = ppc_trace.posterior_predictive.stack(sample=("chain", "draw"))["likelihood"].values.T
    return az.r2_score(y, y_pred).values[0].round(3)


models_ppc = {
    "baseline_student_t": ppc_base, 
    "baseline_normal": ppc_base_normal, 
    "full_model_norm": ppc_full_1, 
    "full_model_zsn": ppc_full_2, 
    "gaussian_rw": grw_ppc_base
}
model_r2_scores = dict(zip(models_ppc.keys(), [0] * len(models_ppc)))

for model_name, ppc_ in models_ppc.items():
    model_r2_scores[model_name] = compute_r2_score(ppc_, rentals_scaled)
    
r2_df = pd.DataFrame.from_dict(model_r2_scores, orient='index', columns =['R2 Score']).T
# r2_df.to_latex(float_format="{:.3f}".format)
r2_df
Out[31]:
baseline_student_t baseline_normal full_model_norm full_model_zsn gaussian_rw
R2 Score 0.455 0.45 0.486 0.486 0.803

Parameter Analysis¶

In [32]:
axes = az.plot_trace(
    data=full_trace_2,
    var_names=["~mu"],
    compact=True,
    kind="rank_bars",
    backend_kwargs={"figsize": (12, 15), "layout": "constrained"},
)
plt.gcf().suptitle("Full Model With ZeroSumCategorical Priors - Trace")
plt.show()
In [33]:
axes = az.plot_forest(
    data=grw_trace,
    var_names=["~mu", "~nu", "~sigma", "~beta_temp"],
    combined=True,
    r_hat=True,
    ess=True,
    figsize=(10, 6),
    hdi_prob = 0.95
)
axes[0].axvline(x=0.0, color="black", linestyle="--")
plt.gcf().suptitle("Gaussian Random Walk Model")
plt.show()

Multivariate Regression: Negative Binomial Likelihood¶

In [34]:
rng = np.random.default_rng(42)

coords = {
    "date": date,
    "workingday": workingday,
    "weathersit": weathersit,
    "holiday": holiday,
}
with pm.Model(coords=coords) as nb_model:
    # Intercept Prior
    intercept = pm.Normal(name="intercept", mu = 0, sigma = 1)
    
    # Beta Coefficient Priors (Informative) - Numeric 
    # Temp: https://www.weather.gov/media/lwx/climate/dcatemps.pdf
    beta_temp = pm.Normal("beta_temp", mu = 0.5, sigma = 0.2)
    # Humidity: https://www.worlddata.info/america/usa/climate-washington-d-c.php
    beta_hum = pm.Normal("beta_hum", mu = 0.5, sigma = 0.1)
    # https://weatherspark.com/y/20957/Average-Weather-in-Washington-D.C.;-United-States-Year-Round#Sections-Wind
    beta_windspeed = pm.Normal("beta_windspeed", mu = 0.2, sigma = .2)
    
    # Beta Coefficient Priors - Categorical
    beta_holiday = pm.ZeroSumNormal("beta_holiday", sigma=1, dims="holiday")
    beta_workingday = pm.ZeroSumNormal("beta_workingday", sigma=1, dims="workingday")
    beta_weathersit = pm.ZeroSumNormal("beta_weathersit", sigma=1, dims="weathersit")
    
    alpha = pm.Exponential("alpha", 10)
    lambda_ = pm.math.exp(
        intercept
        + beta_temp * temp
        + beta_hum * hum
        + beta_windspeed * windspeed
        + beta_holiday[holiday_idx]
        + beta_workingday[workingday_idx]
        + beta_weathersit[weathersit_idx]        
    )

    # Use NB distribution since data is quite dispersed
    likelihood = pm.NegativeBinomial("likelihood", mu = lambda_, alpha = alpha, observed = rentals_scaled, dims = "date")

    nb_trace = pm.sample(
        draws = 5000,
        chains = 4,
        tune = 1000,
        cores = 4,
        target_accept=0.95, # getting some divergences when not set
        idata_kwargs = {"log_likelihood": True},
        random_seed = rng,
    )
    prior_pred_nb = pm.sample_prior_predictive(samples = 1000, random_seed = rng)
    ppc_nb = pm.sample_posterior_predictive(
        trace = nb_trace, random_seed = rng, extend_inferencedata = True,
    )
    
    
az.summary(nb_trace, hdi_prob = 0.95, var_names = ["~mu"], round_to = 3)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, beta_temp, beta_hum, beta_windspeed, beta_holiday, beta_workingday, beta_weathersit, alpha]
100.00% [24000/24000 00:29<00:00 Sampling 4 chains, 0 divergences]
Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 49 seconds.
Sampling: [alpha, beta_holiday, beta_hum, beta_temp, beta_weathersit, beta_windspeed, beta_workingday, intercept, likelihood]
Sampling: [likelihood]
100.00% [20000/20000 00:05<00:00]
Out[34]:
mean sd hdi_2.5% hdi_97.5% mcse_mean mcse_sd ess_bulk ess_tail r_hat
intercept -0.356 1.041 -2.415 1.658 0.007 0.007 21198.709 13557.411 1.0
beta_temp 0.491 0.200 0.107 0.890 0.001 0.001 24615.140 15856.004 1.0
beta_hum 0.498 0.100 0.308 0.695 0.001 0.000 28101.113 15475.412 1.0
beta_windspeed 0.193 0.201 -0.199 0.592 0.001 0.001 23984.636 14524.279 1.0
beta_holiday[0] -0.333 1.042 -2.357 1.711 0.006 0.007 26113.399 15965.030 1.0
beta_holiday[1] 0.333 1.042 -1.711 2.357 0.006 0.007 26113.399 15965.030 1.0
beta_workingday[0] 0.147 1.000 -1.749 2.142 0.007 0.007 23442.313 14708.537 1.0
beta_workingday[1] -0.147 1.000 -2.142 1.749 0.007 0.007 23442.313 14708.537 1.0
beta_weathersit[2] -0.014 1.011 -1.960 1.984 0.006 0.007 25341.292 15910.256 1.0
beta_weathersit[1] -0.150 1.012 -2.137 1.848 0.007 0.007 21584.289 15200.033 1.0
beta_weathersit[3] 0.164 1.011 -1.802 2.172 0.007 0.007 23618.086 14431.092 1.0
alpha 0.001 0.002 0.000 0.001 0.000 0.000 13931.740 11538.681 1.0

Posterior & Prior Predictive Checks¶

In [35]:
plot_posterior_predictive(nb_trace, 'Negative Binomial Model: All Features With ZeroSumCategorical Priors', model_name = 'nb_reg_posterior_fit_all_feats_zsn', save_fig = False)